import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| customerID | 7590-VHVEG | 5575-GNVDE | 3668-QPYBK | 7795-CFOCW | 9237-HQITU |
| gender | Female | Male | Male | Male | Female |
| SeniorCitizen | 0 | 0 | 0 | 0 | 0 |
| Partner | Yes | No | No | No | No |
| Dependents | No | No | No | No | No |
| tenure | 1 | 34 | 2 | 45 | 2 |
| PhoneService | No | Yes | Yes | No | Yes |
| MultipleLines | No phone service | No | No | No phone service | No |
| InternetService | DSL | DSL | DSL | DSL | Fiber optic |
| OnlineSecurity | No | Yes | Yes | Yes | No |
| OnlineBackup | Yes | No | Yes | No | No |
| DeviceProtection | No | Yes | No | Yes | No |
| TechSupport | No | No | No | Yes | No |
| StreamingTV | No | No | No | No | No |
| StreamingMovies | No | No | No | No | No |
| Contract | Month-to-month | One year | Month-to-month | One year | Month-to-month |
| PaperlessBilling | Yes | No | Yes | No | Yes |
| PaymentMethod | Electronic check | Mailed check | Mailed check | Bank transfer (automatic) | Electronic check |
| MonthlyCharges | 29.85 | 56.95 | 53.85 | 42.3 | 70.7 |
| TotalCharges | 29.85 | 1889.5 | 108.15 | 1840.75 | 151.65 |
| Churn | No | No | Yes | No | Yes |
df.describe()
| SeniorCitizen | tenure | MonthlyCharges | |
|---|---|---|---|
| count | 7043.000000 | 7043.000000 | 7043.000000 |
| mean | 0.162147 | 32.371149 | 64.761692 |
| std | 0.368612 | 24.559481 | 30.090047 |
| min | 0.000000 | 0.000000 | 18.250000 |
| 25% | 0.000000 | 9.000000 | 35.500000 |
| 50% | 0.000000 | 29.000000 | 70.350000 |
| 75% | 0.000000 | 55.000000 | 89.850000 |
| max | 1.000000 | 72.000000 | 118.750000 |
shape =df.shape
print(shape)
(7043, 21)
missing_values_count = df.isnull().sum()
print(missing_values_count)
customerID 0 gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 0 Churn 0 dtype: int64
After exploring the data, I found that `isnull()` reports no missing values in the dataset.
# How many cells does the frame hold in total, and how many are missing?
# np.prod replaces np.product, which was deprecated and then removed in NumPy 2.0.
total_cells = np.prod(df.shape)
total_missing = missing_values_count.sum()
# Percentage of all cells that are missing
percent_missing = (total_missing / total_cells) * 100
print(percent_missing)
0.0
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null int64 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object 10 OnlineBackup 7043 non-null object 11 DeviceProtection 7043 non-null object 12 TechSupport 7043 non-null object 13 StreamingTV 7043 non-null object 14 StreamingMovies 7043 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null object dtypes: float64(1), int64(2), object(18) memory usage: 1.1+ MB
Drop the customerID column; it is an identifier, not an informative feature.
df.drop(columns="customerID",inplace=True)
df["TotalCharges"].describe()
count 7043 unique 6531 top freq 11 Name: TotalCharges, dtype: object
len(df[df['TotalCharges']==' '])
11
Convert the "TotalCharges" column to numeric, coercing whitespace-only entries to NaN.
# errors='coerce' turns every value that cannot be parsed as a number into NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Re-run the blank-string check that was used before the conversion
len(df[df['TotalCharges']==' '])
0
df.isnull().sum()
gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 11 Churn 0 dtype: int64
# Recount the missing cells per column after the TotalCharges conversion
missing_values_count2 = df.isnull().sum()
# Total number of cells in the frame.
# np.prod replaces np.product, which was deprecated and then removed in NumPy 2.0.
total_cells = np.prod(df.shape)
total_missing = missing_values_count2.sum()
# Percentage of all cells that are missing
percent_missing = (total_missing / total_cells) * 100
print(percent_missing)
0.007809172227743859
Since the percentage of null values is almost zero, we drop all rows that contain nulls.
df.dropna(inplace=True)
SeniorCitizen should be treated as a categorical feature.
df['SeniorCitizen']=df['SeniorCitizen'].astype('object')
Convert the target column from object to int.
df['Churn'].unique()
array(['No', 'Yes'], dtype=object)
df[df['Churn']=='Yes'].head(3)
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 4 | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
| 5 | Female | 0 | No | No | 8 | Yes | Yes | Fiber optic | No | No | Yes | No | Yes | Yes | Month-to-month | Yes | Electronic check | 99.65 | 820.50 | Yes |
df['Churn']=(df.Churn=='Yes').astype("int")
The dataset is imbalanced, with roughly a 3 : 1 ratio of Not-Churn to Churn customers!
Because of this, predictions will be biased towards Not-Churn customers. Visualizations will also display this bias!
df['Churn'].value_counts()
0 5163 1 1869 Name: Churn, dtype: int64
# Number of customers in each churn class
churn_counts = df['Churn'].value_counts()
bar_positions = churn_counts.index
bar_heights = churn_counts.values

# One bar per class
plt.figure(figsize=(4, 4))
plt.bar(bar_positions, bar_heights, color=['blue', 'red'])
plt.xticks(bar_positions, ['No Churn', 'Churn'])
plt.xlabel('Churn Status (0: No, 1: Yes)')
plt.ylabel('Count')
plt.title('Churn Distribution')
plt.show()
Visualizing the dataset to understand the relationship between the target and the features
df["PaymentMethod"].unique()
array(['Electronic check', 'Mailed check', 'Bank transfer (automatic)',
'Credit card (automatic)'], dtype=object)
# Get the list of column names
columns = df.columns.tolist()

# Split the columns by dtype: object columns are treated as categorical,
# every other column as numerical.
categorical = [col for col in columns if df[col].dtype == 'object']
numerical = [col for col in columns if df[col].dtype != 'object']

# Print the lists
print("Numerical Variables:")
print(numerical)
print("\nCategorical Variables:")
print(categorical)
Numerical Variables: ['tenure', 'MonthlyCharges', 'TotalCharges', 'Churn'] Categorical Variables: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
# Categorical features to plot against the target
features = ['gender', 'SeniorCitizen', 'Partner',
            'Dependents', 'PhoneService', 'MultipleLines',
            'InternetService', 'OnlineSecurity', 'OnlineBackup',
            'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
            'Contract', 'PaperlessBilling', 'PaymentMethod']

# Target variable used as the hue of every plot
target = 'Churn'

# One subplot row per feature, stacked vertically
fig, axes = plt.subplots(nrows=len(features), ncols=1, figsize=(10, 5 * len(features)))

# Draw the per-category counts of each feature, split by the target
for ax, feature in zip(axes, features):
    sns.countplot(data=df, x=feature, hue=target, ax=ax)
    ax.set_title(f'{feature} vs {target}')

# Adjust layout
plt.tight_layout()
plt.show()
# Print, for every feature, the percentage of churned customers in each category
for feature in features:
    total_counts = df[feature].value_counts()
    churn_counts = df[df[target] == 1][feature].value_counts()
    # A category without any churned customer is absent from churn_counts,
    # so the division yields NaN for it; fillna(0) reports it as 0 %.
    percentages = (churn_counts / total_counts * 100).fillna(0)
    print(f'{feature} vs {target}')
    print(f'{percentages}')
    print("-------------------------------------")
gender vs Churn Female 26.959518 Male 26.204565 Name: gender, dtype: float64 ------------------------------------- SeniorCitizen vs Churn 0 23.650255 1 41.681261 Name: SeniorCitizen, dtype: float64 ------------------------------------- Partner vs Churn No 32.976092 Yes 19.717065 Name: Partner, dtype: float64 ------------------------------------- Dependents vs Churn No 31.279140 Yes 15.531205 Name: Dependents, dtype: float64 ------------------------------------- PhoneService vs Churn Yes 26.747481 No 25.000000 Name: PhoneService, dtype: float64 ------------------------------------- MultipleLines vs Churn No 25.081241 No phone service 25.000000 Yes 28.648466 Name: MultipleLines, dtype: float64 ------------------------------------- InternetService vs Churn Fiber optic 41.892765 DSL 18.998344 No 7.434211 Name: InternetService, dtype: float64 ------------------------------------- OnlineSecurity vs Churn No 41.778667 Yes 14.640199 No internet service 7.434211 Name: OnlineSecurity, dtype: float64 ------------------------------------- OnlineBackup vs Churn No 39.941691 Yes 21.567010 No internet service 7.434211 Name: OnlineBackup, dtype: float64 ------------------------------------- DeviceProtection vs Churn No 39.140271 Yes 22.539289 No internet service 7.434211 Name: DeviceProtection, dtype: float64 ------------------------------------- TechSupport vs Churn No 41.647465 Yes 15.196078 No internet service 7.434211 Name: TechSupport, dtype: float64 ------------------------------------- StreamingTV vs Churn No 33.535066 Yes 30.114687 No internet service 7.434211 Name: StreamingTV, dtype: float64 ------------------------------------- StreamingMovies vs Churn No 33.728875 Yes 29.952398 No internet service 7.434211 Name: StreamingMovies, dtype: float64 ------------------------------------- Contract vs Churn Month-to-month 42.709677 One year 11.277174 Two year 2.848665 Name: Contract, dtype: float64 ------------------------------------- PaperlessBilling vs Churn Yes 33.589251 No 
16.375698 Name: PaperlessBilling, dtype: float64 ------------------------------------- PaymentMethod vs Churn Electronic check 45.285412 Mailed check 19.201995 Bank transfer (automatic) 16.731518 Credit card (automatic) 15.253123 Name: PaymentMethod, dtype: float64 -------------------------------------
import matplotlib.pyplot as plt

# Number of customers per value of the 'gender' column
gender_counts = df['gender'].value_counts()

# Slice names and slice sizes both come from the same counts
labels = gender_counts.index
sizes = gender_counts.values

# Draw the pie
plt.figure(figsize=(4, 4))
plt.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    startangle=140,
    colors=['#ff9999', '#66b3ff'],
)
plt.title('Gender Distribution')
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()
import matplotlib.pyplot as plt

# Number of customers per value of the 'SeniorCitizen' column (0 or 1)
SeniorCitizen_counts = df['SeniorCitizen'].value_counts()

# Derive each label from the category value itself rather than from its
# position, so the labels stay correct whatever order value_counts() returns.
senior_label_by_value = {0: 'Not Senior Citizen', 1: 'Senior Citizen'}
labels = [senior_label_by_value[value] for value in SeniorCitizen_counts.index]
sizes = SeniorCitizen_counts.values

# Create the pie chart
plt.figure(figsize=(4, 4))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140, colors=['#ff9999', '#66b3ff'])
# Title names the plotted column (it previously read 'Gender Distribution')
plt.title('Senior Citizen Distribution')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
import matplotlib.pyplot as plt

# Number of customers per value of the 'Partner' column ('Yes' or 'No')
Partner_counts = df['Partner'].value_counts()

# Derive each label from the category value itself rather than from its
# position, so the labels stay correct whatever order value_counts() returns.
partner_label_by_value = {'No': 'Not Partner', 'Yes': 'Partner'}
labels = [partner_label_by_value[value] for value in Partner_counts.index]
sizes = Partner_counts.values

# Create the pie chart
plt.figure(figsize=(4, 4))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140, colors=['#ff9999', '#66b3ff'])
plt.title('Partner Distribution')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
import plotly.express as px
import plotly.graph_objects as go

# Take labels and values from the same value_counts() result so every slice is
# paired with its own count. unique() returns first-appearance order, which is
# not guaranteed to match the frequency order of value_counts().
values = df['PaymentMethod'].value_counts()
labels = values.index.to_numpy()

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.update_layout(title_text="<b>Payment Method Distribution</b>")
fig.show()
df[numerical].corr()
| tenure | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|
| tenure | 1.000000 | 0.246862 | 0.825880 | -0.354049 |
| MonthlyCharges | 0.246862 | 1.000000 | 0.651065 | 0.192858 |
| TotalCharges | 0.825880 | 0.651065 | 1.000000 | -0.199484 |
| Churn | -0.354049 | 0.192858 | -0.199484 | 1.000000 |
#relation with the target [churn]
plt.figure(figsize=(5,5))
sns.heatmap(df[numerical].corr(),annot=True,cmap='YlGnBu')
<Axes: >
df.hist(figsize=(8,8))
array([[<Axes: title={'center': 'tenure'}>,
<Axes: title={'center': 'MonthlyCharges'}>],
[<Axes: title={'center': 'TotalCharges'}>,
<Axes: title={'center': 'Churn'}>]], dtype=object)
df[numerical].corrwith(df.Churn)
tenure -0.354049 MonthlyCharges 0.192858 TotalCharges -0.199484 Churn 1.000000 dtype: float64
Tenure has a negative correlation with churn, which makes sense: the longer customers stay, the less often they tend to churn.
# Visualizing Churn Rate by Tenure
import seaborn as sns
import matplotlib.pyplot as plt

tenure_values = df['tenure']

# Mean of the Churn column inside each tenure band
t1 = df.loc[tenure_values <= 2, 'Churn'].mean()             # 2 months or less
t2 = df.loc[tenure_values.between(3, 12), 'Churn'].mean()   # 3 to 12 months, both ends included
t3 = df.loc[tenure_values > 12, 'Churn'].mean()             # more than 12 months

# One bar per band
tenure_bands = ['1-2', '3-12', '+12']
sns.barplot(x=tenure_bands, y=[t1, t2, t3], palette='Greens')
plt.title('Churn Rate by Tenure')
plt.xlabel('Tenure')
plt.ylabel('Churn Rate')
plt.show()
Monthly Charges has a positive correlation with our target,
which means that customers who pay more tend to churn more.
# Churn rate per monthly-charge band. MonthlyCharges is a float column, so the
# bands are made contiguous (<= 20, (20, 50], > 50); otherwise charges such as
# 20.5 would fall between the bands and be left out of every one.
# less than or equal to 20
mc1 = df[df['MonthlyCharges'] <= 20].Churn.mean()
# above 20, up to and including 50
mc2 = df[(df.MonthlyCharges > 20) & (df.MonthlyCharges <= 50)].Churn.mean()
# more than 50
mc3 = df[df['MonthlyCharges'] > 50].Churn.mean()
sns.barplot(x=['0-20', '20-50', '+50'], y=[mc1, mc2, mc3], palette='Greens')
plt.title('Churn Rate by Monthly Charges')
plt.xlabel('Monthly Charges')
plt.ylabel('Churn Rate');
TotalCharges has a negative correlation with churn.
The longer people stay with the company, the more they have paid in total, so it’s less likely that they will leave.
total_charges = df['TotalCharges']

# Mean of the Churn column inside each total-charge band
tc1 = df.loc[total_charges <= 1000, 'Churn'].mean()                            # up to 1000
tc2 = df.loc[(total_charges > 1000) & (total_charges <= 5000), 'Churn'].mean()  # above 1000, up to 5000
tc3 = df.loc[total_charges > 5000, 'Churn'].mean()                             # above 5000

# One bar per band
sns.barplot(x=['0-1000', '1000-5000', '+5000'], y=[tc1, tc2, tc3], palette='Greens')
plt.title('Churn Rate by Total Charges')
plt.xlabel('Total Charges')
plt.ylabel('Churn Rate');
# Columns used as model inputs
feature = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'tenure',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'MonthlyCharges',
    'TotalCharges',
]

# Input matrix and target vector
X = df.loc[:, feature]
y = df.loc[:, target]
X.shape
(7032, 19)
y.shape
(7032,)
# Show the distinct values of every categorical column.
# The loop variable is named `col` so it does not overwrite the `feature`
# list of model input columns.
for col in categorical:
    print(col)
    print(df[col].unique())
gender ['Female' 'Male'] SeniorCitizen [0 1] Partner ['Yes' 'No'] Dependents ['No' 'Yes'] PhoneService ['No' 'Yes'] MultipleLines ['No phone service' 'No' 'Yes'] InternetService ['DSL' 'Fiber optic' 'No'] OnlineSecurity ['No' 'Yes' 'No internet service'] OnlineBackup ['Yes' 'No' 'No internet service'] DeviceProtection ['No' 'Yes' 'No internet service'] TechSupport ['No' 'Yes' 'No internet service'] StreamingTV ['No' 'Yes' 'No internet service'] StreamingMovies ['No' 'Yes' 'No internet service'] Contract ['Month-to-month' 'One year' 'Two year'] PaperlessBilling ['Yes' 'No'] PaymentMethod ['Electronic check' 'Mailed check' 'Bank transfer (automatic)' 'Credit card (automatic)']
# Applying one hot encoding using Sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# drop='first' omits the first category of each feature from the encoded columns
ohe = OneHotEncoder(drop='first')

# Encode the categorical columns and pass the remaining columns through unchanged.
# sparse_threshold=0 always yields a dense array, which pd.DataFrame can wrap directly.
transformer = make_column_transformer((ohe, categorical), remainder='passthrough',
                                      sparse_threshold=0,
                                      verbose_feature_names_out=False)
train = transformer.fit_transform(X)

# Reuse X's index so the encoded rows keep the same labels as the rows of y
DF = pd.DataFrame(train, columns=transformer.get_feature_names_out(), index=X.index)
from sklearn.model_selection import train_test_split
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(DF, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below touch only these frames
X_train = X_train.copy()
X_test = X_test.copy()

numerical_new = ['tenure', 'MonthlyCharges', 'TotalCharges']
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training rows only, then apply the same transformation
# to the test rows. The scaled values are written back into the frames;
# previously the result was discarded, so the model saw unscaled data.
X_train[numerical_new] = scaler.fit_transform(X_train[numerical_new])
X_test[numerical_new] = scaler.transform(X_test[numerical_new])
array([[ 0.43467403, -0.51862031, -0.08790784],
[-1.19565249, -0.3744434 , -0.92128463],
[ 0.92377199, 0.43261589, 0.96378055],
...,
[-0.95110351, 0.5453059 , -0.65824663],
[ 0.71998118, -1.48974299, -0.56791079],
[-0.50276372, 0.29672501, -0.35842967]])
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver and a fixed seed
model = LogisticRegression(random_state=1, solver='liblinear')
# Train on the training split
model.fit(X=X_train, y=y_train)
LogisticRegression(random_state=1, solver='liblinear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(random_state=1, solver='liblinear')
y_test_pred = model.predict_proba(X_test)
y_test_pred
array([[0.99147682, 0.00852318],
[0.88337147, 0.11662853],
[0.29251247, 0.70748753],
...,
[0.88729203, 0.11270797],
[0.75771288, 0.24228712],
[0.99711412, 0.00288588]])
y_pred = model.predict(X_test)
y_pred
array([0, 0, 1, ..., 0, 0, 0])
# Compare the predicted class labels with the true labels element by element.
# y_test_pred holds the predict_proba probabilities, not labels, so it is not used here.
y_pred == y_test
2481 False
6784 False
6125 False
3052 False
4099 False
...
1733 False
5250 False
5465 False
5851 False
3984 False
Name: Churn, Length: 1407, dtype: bool
print('LogisticRegression Training Accuracy: ', round(model.score(X_train, y_train), 2))
print('LogisticRegression Testing Accuracy: ', round(model.score(X_test, y_test), 2))
LogisticRegression Training Accuracy: 0.81 LogisticRegression Testing Accuracy: 0.79
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot= True, fmt='0.0f')
<Axes: >
cm
array([[918, 115],
[184, 190]], dtype=int64)
# Flatten the 2x2 matrix row by row into its four cells
Tn, Fp, Fn, Tp = cm.ravel()

# Print every cell next to its name
for cell_name, cell_count in (
    ('True Negative: ', Tn),
    ('False Positive: ', Fp),
    ('False Negative: ', Fn),
    ('True Positive: ', Tp),
):
    print(cell_name, cell_count)
True Negative: 918 False Positive: 115 False Negative: 184 True Positive: 190
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Printed heading paired with the scoring function that produces its value
metric_report = (
    ('Accuracy: ', accuracy_score),
    ('Precision: ', precision_score),
    ('Recall: ', recall_score),
    ('F1-Score: ', f1_score),
)

# Score the test-set predictions with every metric
for heading, scorer in metric_report:
    print(heading, scorer(y_test, y_pred))
Accuracy: 0.7874911158493249 Precision: 0.6229508196721312 Recall: 0.5080213903743316 F1-Score: 0.5596465390279823